from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import gseapy
from gseapy.plot import gseaplot, heatmap, barplot, dotplot
import sys
sys.path.insert(0, '..')
import Datanalytics as da
import TerraFunction as terra
%load_ext autoreload
from Helper import *
%autoreload 2
%load_ext rpy2.ipython
from taigapy import TaigaClient
tc = TaigaClient()
import numpy as np
from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from helper import pyDESeq2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from umap import UMAP
# biggest change genes across time point
# GSEA
#counts = pd.read_csv("data/expression.MAX_AMLproject.counts.tsv", sep='\t')
counts = pd.read_csv("data/MAX_AMLproject.rsem_genes_expected_count.txt", sep='\t')
#transcripts = pd.read_csv("data/expression.MAX_AMLproject.transcripts.tsv", sep='\t')b
counts.shape
# we have a number of non zero similar to nb of prot coding genes
np.count_nonzero(counts.values[:,2:])/30
counts.columns
filter some more
toremove = np.argwhere(counts[counts.columns.values[2:]].values.var(1)==0)
toremove.ravel()
counts = counts.drop(counts.index[toremove.ravel()],0)
counts = counts.reset_index()
counts = counts.drop(columns='index')
toremove = np.argwhere(counts[counts.columns.values[2:]].values.max(1)==0)
toremove.ravel()
counts = counts.drop(counts.index[toremove.ravel()],0)
counts = counts.reset_index()
counts = counts.drop(columns='index')
counts = counts.drop('transcript_id(s)',1)
counts['gene_id'] = convertGenes(counts['gene_id'])[0]
The avg amount of expressed genes in the samples
np.count_nonzero(counts.values[:,2:])/30
finding the different experiments
DMSO = '1'
MS2 = '2'
JQ1 = '3'
MS2_JQ1 = '4'
START = '0'
counts.columns
counts
DMSO = [val[0]==DMSO for val in counts.columns.values]
DMSO_t = [int(val.split('-')[1][:-1]) if val[0]==DMSO else 0 for val in counts.columns.values]
START = [val[0]==START for val in counts.columns.values]
MS2_t = [int(val.split('-')[1][:-1]) if val[0]==MS2 else 0 for val in counts.columns.values]
MS2_24 = [val == 24 for val in MS2_t]
MS2 = [val[0]==MS2 for val in counts.columns.values]
MS2_JQ1_t = [int(val.split('-')[1][:-1]) if val[0]==MS2_JQ1 else 0 for val in counts.columns.values]
MS2_JQ1 = [val[0]==MS2_JQ1 for val in counts.columns.values]
JQ1_t = [int(val.split('-')[1][:-1]) if val[0]==JQ1 else 0 for val in counts.columns.values]
JQ1 = [val[0]==JQ1 for val in counts.columns.values]
checks = [('0','0h'),('1','-8h'),('1','-24h'),('2','-2h'),('2','-4h'),('2','-8h'),
('2','-16h'),('2','-24h'),('3','-8h'),('4','-8h')]
design = pd.DataFrame(columns=counts.columns.values[1:],
index=['START','DMSO_8','DMSO_24','MS2_2','MS2_4','MS2_8', 'MS2_16', 'MS2_24', 'JQ1_8', 'MS2_JQ1_8'],
data=np.array(
[[1 if check[0]== val[0] and check[1] in val else 0 for val in counts.columns.values[1:]] for check in checks]))
design = design.T
design.index = ['X'+i.replace('-','.') for i in design.index]
res = {}
counts
design
design.sum(0)
for val in design.columns.values[1:]:
cols = [counts.columns[1:][i] for i, a in enumerate(design[['START',val]].values.sum(1)) if a]+['gene_id']
print(cols, val)
data = counts[cols]
data.columns = ['X'+i.replace('-','.') for i in data.columns]
d = design[['START',val]][design[['START',val]].sum(1)==1]
deseq = pyDESeq2.pyDESeq2(count_matrix=data,
design_matrix = d,
design_formula="~START - "+val,
gene_column="Xgene_id")
deseq.run_deseq()
deseq.get_deseq_result()
MS2res = deseq.deseq_result
MS2res.pvalue = np.nan_to_num(np.array(MS2res.pvalue), 1)
MS2res.log2FoldChange = - np.nan_to_num(np.array(MS2res.log2FoldChange), 0)
res[val] = MS2res
res.keys()
ctf=pd.read_csv('data/CTF.csv',header=None)[0].values.tolist()
ctf.remove('IKAROS')
ctf
for k, val in res.items():
a = volcano(val.rename(columns={'Xgene_id':'gene_id'}),tohighlight=ctf,title=k)
try:
show(a)
save(a,k+'.html')
except RuntimeError:
show(a)
for val in design.columns.values[4:]:
cols = [counts.columns[1:][i] for i, a in enumerate(design[['DMSO_8',val]].values.sum(1)) if a]+['gene_id']
print(cols, val)
data = counts[cols]
data.columns = ['X'+i.replace('-','.') for i in data.columns]
d = design[['DMSO_8',val]][design[['DMSO_8',val]].sum(1)==1]
deseq = pyDESeq2.pyDESeq2(count_matrix=data,
design_matrix = d,
design_formula="~DMSO_8 - "+val,
gene_column="Xgene_id")
deseq.run_deseq()
deseq.get_deseq_result()
MS2res = deseq.deseq_result
MS2res.pvalue = np.nan_to_num(np.array(MS2res.pvalue), 1)
MS2res.log2FoldChange = - np.nan_to_num(np.array(MS2res.log2FoldChange), 0)
res[val] = MS2res
for k, val in res.items():
if k in ['MS2_4','MS2_8','MS2_16','MS2_24','JQ1_8','MS2_JQ1_8']:
a = volcano(val.rename(columns={'Xgene_id':'gene_id'}),tohighlight=ctf,title=k)
try:
show(a)
save(a,k+'.html')
except RuntimeError:
show(a)
for val in design.columns[[6,7]]:
cols = [counts.columns[1:][i] for i, a in enumerate(design[['DMSO_24',val]].values.sum(1)) if a]+['gene_id']
print(cols, val)
data = counts[cols]
data.columns = ['X'+i.replace('-','.') for i in data.columns]
d = design[['DMSO_24',val]][design[['DMSO_24',val]].sum(1)==1]
deseq = pyDESeq2.pyDESeq2(count_matrix=data,
design_matrix = d,
design_formula="~DMSO_24 - "+val,
gene_column="Xgene_id")
deseq.run_deseq()
deseq.get_deseq_result()
MS2res = deseq.deseq_result
MS2res.pvalue = np.nan_to_num(np.array(MS2res.pvalue), 1)
MS2res.log2FoldChange = - np.nan_to_num(np.array(MS2res.log2FoldChange), 0)
res[val] = MS2res
for k, val in res.items():
if k in ['MS2_16','MS2_24']:
a = volcano(val.rename(columns={'Xgene_id':'gene_id'}),tohighlight=ctf,title=k)
try:
show(a)
save(a,k+'.html')
except RuntimeError:
show(a)
counts = counts.set_index('gene_id')
checks
names = list(res.keys())
res={}
for i, check in enumerate(checks[1:]):
print(check)
val = names[i]
totest = counts[[v for v in counts.columns if check[0] in v and check[1] in v or '0h' in v]]
cls = ['DMSO' if '0h' in v else 'Condition' for v in totest.columns]
res[val] = gseapy.gsea(data=totest, gene_sets='data/apoptosis.gmt',
cls= cls, no_plot=False, processes=6)
gseaplot(res[val].ranking, term=val, **res[val].results[terms[0]])
for i, check in enumerate(checks[1:]):
print(check)
val = names[i]
#totest = counts[[v for v in counts.columns if check[0] in v and check[1] in v or '0h' in v]]
#cls = ['DMSO' if '0h' in v else 'Condition' for v in totest.columns]
#res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015',
# cls= cls, no_plot=False, processes=6)
#res[val].res2d['Term'] = [i.split('(GO')[0][:50] for i in res[val].res2d.index]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[0]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[1]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[2]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[3]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[4]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[5]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[6]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
val = names[7]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
It seems looking at both plottings of raw RNA data and Differential Expression analysis, that the RNA seq experiment is sound except for the DMSO at 8h and 24h which seemed to have been either contaminated or misslabelled.
We cannot see much differential expression after 16h due to a likely transcriptional collapse destroying our signal.
res
names